library(tidyverse)
   library(janitor)
   library(palmerpenguins)
   library(knitr)

   penguins_raw <- read_csv(path_to_file("penguins_raw.csv")) %>%
      clean_names()
   
   opts_chunk$set(warning = FALSE, message = FALSE)
   
   #https://www.tidymodels.org/learn/statistics/k-means/
   #https://www.guru99.com/r-k-means-clustering.html
   #https://afit-r.github.io/kmeans_clustering
   library(skimr)

   skim (penguins_raw)
Data summary
Name penguins_raw
Number of rows 344
Number of columns 17
_______________________
Column type frequency:
character 9
Date 1
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
study_name 0 1.00 7 7 0 3 0
species 0 1.00 33 41 0 3 0
region 0 1.00 6 6 0 1 0
island 0 1.00 5 9 0 3 0
stage 0 1.00 18 18 0 1 0
individual_id 0 1.00 4 6 0 190 0
clutch_completion 0 1.00 2 3 0 2 0
sex 11 0.97 4 6 0 2 0
comments 290 0.16 18 68 0 10 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date_egg 0 1 2007-11-09 2009-12-01 2008-11-09 50

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sample_number 0 1.00 63.15 40.43 1.00 29.00 58.00 95.25 152.00 ▇▇▆▅▃
culmen_length_mm 2 0.99 43.92 5.46 32.10 39.23 44.45 48.50 59.60 ▃▇▇▆▁
culmen_depth_mm 2 0.99 17.15 1.97 13.10 15.60 17.30 18.70 21.50 ▅▅▇▇▂
flipper_length_mm 2 0.99 200.92 14.06 172.00 190.00 197.00 213.00 231.00 ▂▇▃▅▂
body_mass_g 2 0.99 4201.75 801.95 2700.00 3550.00 4050.00 4750.00 6300.00 ▃▇▆▃▂
delta_15_n_o_oo 14 0.96 8.73 0.55 7.63 8.30 8.65 9.17 10.03 ▃▇▆▅▂
delta_13_c_o_oo 13 0.96 -25.69 0.79 -27.02 -26.32 -25.83 -25.06 -23.79 ▆▇▅▅▂
   penguins <- penguins_raw %>%
      rename (
         bill_length = culmen_length_mm,
         bill_depth = culmen_depth_mm,
         flipper_length = flipper_length_mm,
         body_mass = body_mass_g
         ) %>%
      mutate (
         id = row_number(),
         species = word (species, 1),
         bill_length = scale(bill_length),
         bill_depth = scale(bill_depth),
         flipper_length = scale(flipper_length),
         body_mass = scale(body_mass)
         ) %>%
      select (id, species, island, sex, bill_length, bill_depth, flipper_length, body_mass) %>%
      drop_na
    library (GGally)

    ggpairs(
       data = penguins_raw,
       columns = c(10:14),
       diag = list(continuous = wrap("barDiag", colour = "blue")),
       upper = list(continuous = wrap("cor", size = 6))
         )

Visualize potential clusters

   library(factoextra)
   library(glue)

   kmeans_flex <- function (k) {
      penguins_kmeans <- kmeans(penguins[5:7], k) 
      fviz_cluster(penguins_kmeans, geom = "point", data = penguins[5:7]) +
      labs(title = glue("{k} clusters")) +
      theme (
         plot.title = element_text (margin = margin(0,0,5,0), hjust = 0.5, size = 15, family = "Lato"),
         plot.background = element_blank(),
         legend.text = element_text(hjust = 0, size = 13, family = "Lato")
         ) 
      }

   map (1:10, kmeans_flex)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Identify optimal number of clusters.

   library(broom)
   library(here)
   
   set.seed(920)
   
   penguins_clust <- 
      tibble (k = 1:10) %>%
      mutate (
         cluster = map(k, ~kmeans(penguins[5:7], .x, nstart=25)),
         glanced = map(cluster, glance)
         ) %>%
      unnest (glanced) %>%
      select (-cluster) %>%
      clean_names %>%
      write_csv (here("2020-07-28","output", "model.csv")) %>%
      rename (
         total = totss,
         intra_cluster = tot_withinss,
         inter_cluster = betweenss
      ) %>%
      pivot_longer (
         cols = total:inter_cluster,
         names_to = "residual_type",
         values_to = "residual_sum"
      )

Identify optimal number of clusters

   ggplot (penguins_clust, aes(x = k, y= residual_sum, fill = residual_type)) +
      geom_col (data = penguins_clust %>% filter (residual_type != "total")) + 
      geom_line (data = penguins_clust %>% filter (residual_type == "intra_cluster")) 

   fviz_nbclust(penguins[5:7], kmeans, method = "wss") 

   fviz_nbclust(penguins[5:7], kmeans, method = "silhouette") 

Compare optimized clusters relative to labeled data

   penguins_kmeans_extended <- 
      kmeans(penguins[5:7], 2) %>%
      augment (penguins) %>%
      rename (cluster = .cluster) %>%
      select (cluster, species, island, sex, id)

   attribute <- c("species", "island", "sex")
   
   cluster_compare <- function (attribute) {
      attribute = ensym (attribute)
      penguins_kmeans_extended %>%
      count(!!attribute,cluster) %>%
      ggplot(aes(x=!!attribute, y=cluster, fill = n)) + 
         geom_tile() 
      }

   map (attribute,cluster_compare)
## [[1]]

## 
## [[2]]

## 
## [[3]]